import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno
Read the AB_NYC_2019.csv file
AirBnB = pd.read_csv("AB_NYC_2019.csv")
Describe the dataset
AirBnB.describe()
| id | host_id | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.889500e+04 | 4.889500e+04 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 38843.000000 | 48895.000000 | 48895.000000 |
| mean | 1.901714e+07 | 6.762001e+07 | 40.728949 | -73.952170 | 152.720687 | 7.029962 | 23.274466 | 1.373221 | 7.143982 | 112.781327 |
| std | 1.098311e+07 | 7.861097e+07 | 0.054530 | 0.046157 | 240.154170 | 20.510550 | 44.550582 | 1.680442 | 32.952519 | 131.622289 |
| min | 2.539000e+03 | 2.438000e+03 | 40.499790 | -74.244420 | 0.000000 | 1.000000 | 0.000000 | 0.010000 | 1.000000 | 0.000000 |
| 25% | 9.471945e+06 | 7.822033e+06 | 40.690100 | -73.983070 | 69.000000 | 1.000000 | 1.000000 | 0.190000 | 1.000000 | 0.000000 |
| 50% | 1.967728e+07 | 3.079382e+07 | 40.723070 | -73.955680 | 106.000000 | 3.000000 | 5.000000 | 0.720000 | 1.000000 | 45.000000 |
| 75% | 2.915218e+07 | 1.074344e+08 | 40.763115 | -73.936275 | 175.000000 | 5.000000 | 24.000000 | 2.020000 | 2.000000 | 227.000000 |
| max | 3.648724e+07 | 2.743213e+08 | 40.913060 | -73.712990 | 10000.000000 | 1250.000000 | 629.000000 | 58.500000 | 327.000000 | 365.000000 |
Look at the top and bottom 4 rows of data and look at the shape of the data frame (number of rows and columns)
AirBnB.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
AirBnB.tail()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48890 | 36484665 | Charming one bedroom - newly renovated rowhouse | 8232441 | Sabrina | Brooklyn | Bedford-Stuyvesant | 40.67853 | -73.94995 | Private room | 70 | 2 | 0 | NaN | NaN | 2 | 9 |
| 48891 | 36485057 | Affordable room in Bushwick/East Williamsburg | 6570630 | Marisol | Brooklyn | Bushwick | 40.70184 | -73.93317 | Private room | 40 | 4 | 0 | NaN | NaN | 2 | 36 |
| 48892 | 36485431 | Sunny Studio at Historical Neighborhood | 23492952 | Ilgar & Aysel | Manhattan | Harlem | 40.81475 | -73.94867 | Entire home/apt | 115 | 10 | 0 | NaN | NaN | 1 | 27 |
| 48893 | 36485609 | 43rd St. Time Square-cozy single bed | 30985759 | Taz | Manhattan | Hell's Kitchen | 40.75751 | -73.99112 | Shared room | 55 | 1 | 0 | NaN | NaN | 6 | 2 |
| 48894 | 36487245 | Trendy duplex in the very heart of Hell's Kitchen | 68119814 | Christophe | Manhattan | Hell's Kitchen | 40.76404 | -73.98933 | Private room | 90 | 7 | 0 | NaN | NaN | 1 | 23 |
AirBnB.shape
(48895, 16)
Examine numerical features in the dataset
numeric_features = AirBnB.select_dtypes(include=[np.number])
numeric_features.columns
Index(['id', 'host_id', 'latitude', 'longitude', 'price', 'minimum_nights',
'number_of_reviews', 'reviews_per_month',
'calculated_host_listings_count', 'availability_365'],
dtype='object')
Examine categorical features in the dataset
categorical_features = AirBnB.select_dtypes(include=[object])
categorical_features.columns
Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
'room_type', 'last_review'],
dtype='object')
Estimate Skewness and Kurtosis
AirBnB.skew()
C:\Users\leigh\AppData\Local\Temp\ipykernel_13940\2556107286.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. AirBnB.skew()
id -0.090257 host_id 1.206214 latitude 0.237167 longitude 1.284210 price 19.118939 minimum_nights 21.827275 number_of_reviews 3.690635 reviews_per_month 3.130189 calculated_host_listings_count 7.933174 availability_365 0.763408 dtype: float64
AirBnB.kurt()
C:\Users\leigh\AppData\Local\Temp\ipykernel_13940\772064313.py:1: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. AirBnB.kurt()
id -1.227748 host_id 0.169106 latitude 0.148845 longitude 5.021646 price 585.672879 minimum_nights 854.071662 number_of_reviews 19.529788 reviews_per_month 42.493469 calculated_host_listings_count 67.550888 availability_365 -0.997534 dtype: float64
Show number of missing values per variable
AirBnB.isna().sum()
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
Replace null values with appropriate values:
AirBnB['name'].fillna('Replaced name', inplace=True)
AirBnB['host_name'].fillna('Replaced host name', inplace=True)
AirBnB['last_review'].fillna(AirBnB['last_review'].mode().iloc[0], inplace=True)
AirBnB['reviews_per_month'].fillna(AirBnB['reviews_per_month'].mean(), inplace=True)
Now re-check to make sure there are no missing values
AirBnB.isna().sum()
id 0 name 0 host_id 0 host_name 0 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 0 reviews_per_month 0 calculated_host_listings_count 0 availability_365 0 dtype: int64
Correlation matrix with price
correlation = numeric_features.corr()
correlation['price'].sort_values(ascending = False)
price 1.000000 availability_365 0.081829 calculated_host_listings_count 0.057472 minimum_nights 0.042799 latitude 0.033939 host_id 0.015309 id 0.010619 reviews_per_month -0.030608 number_of_reviews -0.047954 longitude -0.150019 Name: price, dtype: float64
To explore further we will start with the following visualisation methods to analyze the data better:
f , ax = plt.subplots(figsize = (14,12))
plt.title('Correlation of Numeric Features',y=1,size=16)
sns.heatmap(correlation,square = True)
<Axes: title={'center': 'Correlation of Numeric Features'}>
The only strong correlations appear to be number_of_reviews with id, which is not likely to be helpful, and number_of_review with reviews_per_month.
k= 11
cols = correlation.nlargest(k,'price')['price'].index
print(cols)
cm = np.corrcoef(AirBnB[cols].values.T)
f , ax = plt.subplots(figsize = (14,12))
sns.heatmap(cm, vmax=.8, linewidths=0.01,square=True,annot=True,cmap='viridis',
linecolor="white",xticklabels = cols.values ,annot_kws = {'size':12},yticklabels = cols.values)
Index(['price', 'availability_365', 'calculated_host_listings_count',
'minimum_nights', 'latitude', 'host_id', 'id', 'reviews_per_month',
'number_of_reviews', 'longitude'],
dtype='object')
<Axes: >
Price isn't highly correlated with any variable.
Visualisation of 'OverallQual','TotalBsmtSF','GrLivArea','GarageArea','FullBath','YearBuilt','YearRemodAdd' features with respect to SalePrice in the form of pair plot & scatter pair plot for better understanding.
sns.set()
columns = ['price','latitude','longitude','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count']
sns.pairplot(AirBnB[columns],size = 2 ,kind ='scatter',diag_kind='kde')
plt.show()
C:\Users\leigh\anaconda3\lib\site-packages\seaborn\axisgrid.py:2095: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
Unsuprisingly the scatterplot for longitude and latitide align visually with the map of New York, so we'll overlay onto the map later.
Scatter plots between price, longitude and latitude
fig, ((ax1), (ax2),(ax3)) = plt.subplots(nrows=3, ncols=1, figsize=(14,10))
price_longitude_scatter_plot = pd.concat([AirBnB['price'],AirBnB['longitude']],axis = 1)
sns.regplot(x='longitude',y = 'price',data = price_longitude_scatter_plot,scatter= True, fit_reg=True, ax=ax1)
price_latitude_scatter_plot = pd.concat([AirBnB['price'],AirBnB['latitude']],axis = 1)
sns.regplot(x='latitude',y = 'price',data = price_latitude_scatter_plot,scatter= True, fit_reg=True, ax=ax2)
latitude_longitude_scatter_plot = pd.concat([AirBnB['latitude'],AirBnB['longitude']],axis = 1)
sns.regplot(x='longitude',y = 'latitude',data = latitude_longitude_scatter_plot,scatter= True, fit_reg=True, ax=ax3)
<Axes: xlabel='longitude', ylabel='latitude'>
Still very little to show relating price with location.
Bar chart of median price by neighborhood group
neighbourhood_group = AirBnB.pivot_table(index ='neighbourhood_group',values = 'price', aggfunc = np.median)
neighbourhood_group.plot(kind = 'bar',color = 'blue')
plt.xlabel('Neighborhood group')
plt.ylabel('Median price')
Text(0, 0.5, 'Median price')
Box plot model year and cylinders
neighbourhood_group = AirBnB.pivot_table(index ='neighbourhood_group',values = 'price', aggfunc = np.mean)
neighbourhood_group.plot(kind = 'bar',color = 'blue')
plt.xlabel('Neighborhood group')
plt.ylabel('Mean price')
Text(0, 0.5, 'Mean price')
Box plot price by neighborhood group
var = 'neighbourhood_group'
data = pd.concat([AirBnB['price'], AirBnB[var]], axis=1)
f, ax = plt.subplots(figsize=(12, 8))
fig = sns.boxplot(x=var, y="price", data=data)
fig.axis(ymin=0, ymax=500);
Bar chart of number of properties per neighborhood group
neighbourhood_group = AirBnB.pivot_table(index ='neighbourhood_group',values = 'id', aggfunc = 'count')
neighbourhood_group.plot(kind = 'bar',color = 'blue')
plt.xlabel('Neighborhood group')
plt.ylabel('Number of properties')
Text(0, 0.5, 'Number of properties')
Box plot price by neighborhood in Manhattan
Manhattan = AirBnB[AirBnB['neighbourhood_group'] == 'Manhattan']
var = 'neighbourhood'
data = pd.concat([Manhattan['price'], Manhattan[var]], axis=1)
f, ax = plt.subplots(figsize=(12, 8))
fig = sns.boxplot(x=var, y="price", data=data)
fig.axis(ymin=0, ymax=1000);
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
[Text(0, 0, 'Midtown'),
Text(1, 0, 'Harlem'),
Text(2, 0, 'East Harlem'),
Text(3, 0, 'Murray Hill'),
Text(4, 0, "Hell's Kitchen"),
Text(5, 0, 'Upper West Side'),
Text(6, 0, 'Chinatown'),
Text(7, 0, 'West Village'),
Text(8, 0, 'Chelsea'),
Text(9, 0, 'Inwood'),
Text(10, 0, 'East Village'),
Text(11, 0, 'Lower East Side'),
Text(12, 0, 'Kips Bay'),
Text(13, 0, 'SoHo'),
Text(14, 0, 'Upper East Side'),
Text(15, 0, 'Washington Heights'),
Text(16, 0, 'Financial District'),
Text(17, 0, 'Morningside Heights'),
Text(18, 0, 'NoHo'),
Text(19, 0, 'Flatiron District'),
Text(20, 0, 'Roosevelt Island'),
Text(21, 0, 'Greenwich Village'),
Text(22, 0, 'Little Italy'),
Text(23, 0, 'Two Bridges'),
Text(24, 0, 'Nolita'),
Text(25, 0, 'Gramercy'),
Text(26, 0, 'Theater District'),
Text(27, 0, 'Tribeca'),
Text(28, 0, 'Battery Park City'),
Text(29, 0, 'Civic Center'),
Text(30, 0, 'Stuyvesant Town'),
Text(31, 0, 'Marble Hill')])
Bar chart of number of properties per neighborhood in Manhattan
neighbourhood_group = Manhattan.pivot_table(index ='neighbourhood',values = 'id', aggfunc = 'count')
neighbourhood_group.plot(kind = 'bar',color = 'blue')
plt.xlabel('Neighborhood')
plt.ylabel('Number of properties')
Text(0, 0.5, 'Number of properties')
Plot the properties on the map, grouped by neighbourhood_group
plt.figure(figsize=(12,8))
plt.style.use('fast')
# Set the boundary of the map using longitude and latitude obtained from Google Maps
coordinates = (-74.2623, -73.6862, 40.4943, 40.9144)
map = mpimg.imread("New_York_City.jpg")
plt.imshow(map,extent=coordinates)
groups = AirBnB.groupby('neighbourhood_group')
for name,group in groups :
plt.scatter(group['longitude'],group['latitude'], label=name, edgecolors='black')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Properties by neighborhood')
plt.legend()
<matplotlib.legend.Legend at 0x2be60365240>
Now plot the properties using a colour gradiant for price
plt.figure(figsize=(12,8))
plt.imshow(map, extent=coordinates)
plt.scatter(AirBnB.longitude, AirBnB.latitude, c=AirBnB.price, cmap='rainbow', edgecolors='black')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Properties by price')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x2be60599e10>
Since most properties were low making higher value ones difficult to see, plot only those over $800
plt.figure(figsize=(12,8))
plt.imshow(map, extent=coordinates)
high_price = AirBnB[AirBnB['price'] > 800]
plt.scatter(high_price.longitude, high_price.latitude, c=high_price.price, cmap='rainbow', edgecolors='black')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Properties over $800 by price')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x2be62566ec0>
We can start to see where the very expensive properties are located, which is Manhatten as expected from the earlier analysis of price by neighborhood_region, but also Queens which was unexpected.
Now to see the price distribution of the lower cost properties, plot only those under $250
plt.figure(figsize=(12,8))
plt.imshow(map, extent=coordinates)
low_price = AirBnB[AirBnB['price'] < 250]
plt.scatter(low_price.longitude, low_price.latitude, c=low_price.price, cmap='rainbow', edgecolors='black')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Properties under $250 by price')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x2be61bf28c0>
Once again the higher priced properties at the lower end of teh price range are clustered in Manhatten.
Find the fiive figure summary for price so we can plot the middle 50%
AirBnB.price.describe()
count 48895.000000 mean 152.720687 std 240.154170 min 0.000000 25% 69.000000 50% 106.000000 75% 175.000000 max 10000.000000 Name: price, dtype: float64
Now to see the price distribution of the middle 50% of properties by cost, plot only those >= $69 and <= $175
plt.figure(figsize=(12,8))
plt.imshow(map, extent=coordinates)
mid_price = AirBnB[AirBnB['price'] >= 69]
mid_price = mid_price[mid_price['price'] <= 175]
plt.scatter(mid_price.longitude, mid_price.latitude, c=mid_price.price, cmap='rainbow', edgecolors='black')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Properties >= $69 and <= $175 by price')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x2be63d23340>
In the mid-price range the higher prices are again clusted in Manhatten, but they appear to be distributed acros the other regions too.